package com.kdtech.analyse.Bbs;
import com.kdtech.crawler.at.UrlArgumentTop;
import com.kdtech.utils.DateUtils;
import com.kdtech.utils.DoMainUtils;
import com.kdtech.utils.HtmlCleaner;
import com.kdtech.utils.NumberUtils;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

import com.kdtech.crawler.CrawlHTML;

import com.kdtech.entity.crawler.UrlMeta;
import com.kdtech.entity.data.NewsMeta;
import com.kdtech.analyse.AnalyseNews;
import com.kdtech.utils.RegexUtils;
import com.kdtech.utils.StringUtils;

/**
 * Analyser for forum thread pages of the ccoo.cn family of sites.
 * <p>
 * Sites this analyser was written for:
 * <ul>
 * <li>http://www.boluo.ccoo.cn/forum/ (Boluo forum)</li>
 * <li>http://www.huiya.ccoo.cn/forum/ (Huiyang / Daya Bay forum)</li>
 * <li>http://www.huizhou.ccoo.cn/forum/ (Huizhou Online forum)</li>
 * </ul>
 * The URL pattern itself is host-agnostic, so any host serving
 * {@code /forum/thread-<id>-1-1.html} pages is accepted.
 *
 * @author abc
 */
public class CcooBbsAnalyse implements AnalyseNews{

	/**
	 * URL patterns identifying the first page of a thread. The single
	 * host-agnostic pattern replaces the per-site patterns that used to be
	 * enumerated here (boluo/huiya/huizhou.ccoo.cn, i0663.cn, 0817lz.com).
	 */
	private static final String[] DETAIL_PAGE_PATTERNS = {
			"http://.*/forum/thread-[0-9]*-1-1.html"
	};

	/** Selector of the container holding the opening post of a thread. */
	private static final String FIRST_POST_SELECTOR = "div#topic_o";

	/** Selector of the post header line (class name is spelled "menber" in the page markup). */
	private static final String MEMBER_SELECTOR = "div.menber";

	/* Labels that delimit the individual fields inside the post header text. */
	private static final String LABEL_AUTHOR = "作者：";
	private static final String LABEL_READS = "阅读：";
	private static final String LABEL_REPLIES = "回复：";
	private static final String LABEL_POSTED_AT = "发表于：";

	/**
	 * Tells whether the given URL points at a thread detail page.
	 *
	 * @param url the URL to test
	 * @return the result of matching {@code url} against the detail-page patterns
	 *         (case-insensitively, per {@code RegexUtils.matchAnyIgnoreCase})
	 */
	public boolean isDetailPage(String url) {
		return RegexUtils.matchAnyIgnoreCase(url, DETAIL_PAGE_PATTERNS);
	}

	/**
	 * Parses a crawled thread page into a {@link NewsMeta}.
	 * <p>
	 * The title is taken from the {@code <title>} element (text before the first
	 * {@code "_"}); author, date, content and counters are read from the opening
	 * post container. When that container is missing, the counters stay at 0 and
	 * the remaining post fields stay {@code null}.
	 *
	 * @param urlMeta the crawled page (URL plus raw HTML)
	 * @return the populated record, or {@code null} when {@code urlMeta} is
	 *         {@code null} or carries no HTML (previously this case failed inside
	 *         {@code Jsoup.parse})
	 */
	public NewsMeta parserHtml(UrlMeta urlMeta) {
		if (urlMeta == null) {
			return null;
		}
		String htmltxt = urlMeta.getHtml();
		if (htmltxt == null) {
			// Nothing to parse; Jsoup.parse rejects a null document.
			return null;
		}
		String url = urlMeta.getUrl();
		// NOTE(review): the original code had empty guards for "URL is not a
		// detail page" and "title is blank". Whether such pages should be
		// rejected is undecided, so they are still parsed and returned as before.

		Document doc = Jsoup.parse(htmltxt);

		String title = doc.select("title").text();
		if (StringUtils.isNotBlank(title)) {
			title = StringUtils.substringBefore(title, "_");
		}

		String author = null;
		Long date = null;
		String content = null;
		Integer commentNum = 0;
		Integer clickNum = 0;

		Element firstDiv = doc.select(FIRST_POST_SELECTOR).first();
		if (firstDiv != null) {
			// Remove the advertisement shown to visitors who are not logged in.
			firstDiv.getElementsByClass("topic_nologin").remove();
			content = HtmlCleaner.getContentHtml(url, firstDiv.select(".topic_c"));

			String memberText = firstDiv.select(MEMBER_SELECTOR).text();
			date = DateUtils.matchDate(memberText);
			author = StringUtils.substringBetween(memberText, LABEL_AUTHOR, LABEL_READS);
			if (StringUtils.isBlank(author)) {
				// Fall back to the dedicated name element when the header has no author field.
				author = firstDiv.select("div.topic_name").text();
			}
			commentNum = countBetween(memberText, LABEL_REPLIES, LABEL_POSTED_AT);
			clickNum = countBetween(memberText, LABEL_READS, LABEL_REPLIES);
		}

		NewsMeta bbs = new NewsMeta();
		bbs.setUrl(url);
		bbs.setAuthor(author);
		bbs.setCommentNum(commentNum);
		bbs.setClickNum(clickNum);
		bbs.setContent(content);
		bbs.setDate(date);
		bbs.setTitle(title);
		// NOTE(review): 4 is presumably the type code for forum posts - confirm against NewsMeta.
		bbs.setType(4);
		return bbs;
	}

	/**
	 * Re-crawls the record's update URL and refreshes its reply and read counters.
	 * <p>
	 * The record is left untouched when it has no update URL, when the page (or
	 * its HTML) cannot be obtained, or when the opening post is not found.
	 *
	 * @param meta the record to refresh; may be {@code null}
	 * @return the same {@code meta} instance (or {@code null} if {@code null} was passed)
	 */
	public NewsMeta Update(NewsMeta meta) {
		if (meta == null) {
			return meta;
		}
		String updateUrl = meta.getUpdateUrl();
		if (updateUrl == null) {
			return meta;
		}
		UrlMeta response = CrawlHTML.responseToURL(updateUrl);
		if (response == null) {
			return meta;
		}
		String htmltxt = response.getHtml();
		if (htmltxt == null) {
			// Crawl returned no body; Jsoup.parse would reject null, so keep the old counters.
			return meta;
		}

		Element firstDiv = Jsoup.parse(htmltxt).select(FIRST_POST_SELECTOR).first();
		if (firstDiv == null) {
			return meta;
		}

		String memberText = firstDiv.select(MEMBER_SELECTOR).text();
		Integer commentNum = countBetween(memberText, LABEL_REPLIES, LABEL_POSTED_AT);
		Integer clickNum = countBetween(memberText, LABEL_READS, LABEL_REPLIES);

		// NOTE(review): counters are only written when BOTH are positive, so a
		// thread with zero replies never has its read count refreshed. Kept as
		// in the original; confirm whether this is intended.
		if (commentNum > 0 && clickNum > 0) {
			meta.setCommentNum(commentNum);
			meta.setClickNum(clickNum);
		}
		return meta;
	}

	/**
	 * Extracts the number found between two labels of the post header text.
	 *
	 * @param text  the post header text
	 * @param open  label preceding the number
	 * @param close label following the number
	 * @return the value reported by {@code NumberUtils.matchNumber}, with a
	 *         {@code null} result normalised to 0 so callers can compare safely
	 */
	private static Integer countBetween(String text, String open, String close) {
		Integer value = NumberUtils.matchNumber(StringUtils.substringBetween(text, open, close));
		if (value == null) {
			return Integer.valueOf(0);
		}
		return value;
	}

}
